# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
# List every data file available under the Kaggle read-only input directory.
# (The loop bodies lost their indentation in the notebook export; restored here.)
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))
# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session
#EDA
# pandas-profiling builds an automated HTML EDA report from a DataFrame.
# NOTE(review): the package has been renamed upstream to `ydata-profiling`;
# this install/import only works on older environments — confirm the version
# pinned in this image before relying on it.
!pip install pandas-profiling
from pandas_profiling import ProfileReport
**Problem Statement**
The sinking of the Titanic is one of the most infamous shipwrecks in history.
On April 15, 1912, during her maiden voyage, the widely considered “unsinkable” RMS Titanic sank after colliding with an iceberg. Unfortunately, there weren’t enough lifeboats for everyone onboard, resulting in the death of 1502 out of 2224 passengers and crew.
While there was some element of luck involved in surviving, it seems some groups of people were more likely to survive than others.
In this challenge, we ask you to build a predictive model that answers the question: “what sorts of people were more likely to survive?” using passenger data (i.e., name, age, gender, socio-economic class, etc.).
# Scientific Libraries
!pip install plotly xgboost
import pandas as pd
import numpy as np
# Visualization
import matplotlib.pyplot as plt
import matplotlib.style as style
import seaborn as sns
style.use('fivethirtyeight')
# Plotly
import plotly.express as px
import plotly.graph_objs as go
import plotly.offline as py
# Booster for rendering
import plotly.io as pio
pio.renderers.default = 'iframe'
# Warning ignorance
import warnings
warnings.filterwarnings('ignore')
# Scaling
from sklearn.preprocessing import MinMaxScaler, StandardScaler
# Selection
from scipy.stats import chi2_contingency
# Machine Learning Algorithm
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score,StratifiedKFold
# Metrics Evaluation
from sklearn.metrics import accuracy_score, precision_score
from sklearn.metrics import recall_score, f1_score
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
# Hyperparameter Tuning
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
# Show where relative data paths (train.csv / test.csv) will resolve from.
import os, sys
working_dir = os.getcwd()
print("Current working directory:", working_dir)
Current working directory: C:\Users\Lincoln-20150345\Documents\Jupyter Notebook
# Load the Titanic competition data from the current working directory.
train = pd.read_csv(os.path.join(os.getcwd(), 'train.csv'))
test = pd.read_csv(os.path.join(os.getcwd(), 'test.csv'))

# Automated EDA report over the raw training data.
df = pd.DataFrame(data=train)
pr = ProfileReport(df)
pr.to_file(output_file='pandas_profiling.html')
pr
# Column groupings for the raw training data.
# Numerical data
nums = ['PassengerId', 'Age', 'SibSp', 'Parch', 'Fare']
# Categorical data
cats = ['Survived', 'Pclass', 'Name', 'Sex', 'Ticket',
        'Cabin', 'Embarked']

# Working copy of the training data so the raw `train` frame stays intact.
pre1 = train.copy()

# Column groupings actually used for modelling (identifiers excluded).
# Numerical data
num = ['Age', 'SibSp', 'Parch', 'Fare']
# Categorical data
cat = ['Survived', 'Pclass', 'Sex', 'Ticket',
       'Embarked']

# Drop identifier-like columns that carry no predictive signal.
# FIX: pandas 2.0 removed the positional `axis` argument of drop();
# `columns=` is explicit and backward-compatible.
pre1.drop(columns=['PassengerId', 'Name', 'Ticket'], inplace=True)
There are 3 features with missing values: Age, Cabin, and Embarked.
**Train Data**
# --- Train data cleaning ---
# Cabin has by far the highest missing-value ratio, so drop the column;
# Embarked has only 2 missing rows, so drop those rows instead.
# FIX: use keyword `columns=` (positional axis removed in pandas 2.0).
pre1.drop(columns='Cabin', inplace=True)
pre1.dropna(subset=['Embarked'], inplace=True)

# Impute missing Age with the column mean.
# FIX: assign the result instead of chained inplace fillna (deprecated).
pre1['Age'] = pre1['Age'].fillna(pre1['Age'].mean())

# Replace 0 values in Fare with the median because the distribution is skewed.
median_fare = pre1['Fare'].median(skipna=True)
pre1['Fare'] = pre1['Fare'].mask(pre1['Fare'] == 0, median_fare)

# Summarize remaining missing values per feature.
feature = pre1.isna().sum().keys().tolist()
missing = pre1.isna().sum().values.tolist()
mv_check = pd.DataFrame(list(zip(feature, missing)),
                        columns=['feature', 'missing_value'])
# FIX: divide by pre1's own row count — two Embarked rows were dropped above,
# so train.shape[0] is no longer the right denominator.
mv_check['%missing'] = round(((mv_check['missing_value'] / pre1.shape[0]) * 100), 2)
mv_check
| feature | missing_value | %missing | |
|---|---|---|---|
| 0 | Survived | 0 | 0.0 |
| 1 | Pclass | 0 | 0.0 |
| 2 | Sex | 0 | 0.0 |
| 3 | Age | 0 | 0.0 |
| 4 | SibSp | 0 | 0.0 |
| 5 | Parch | 0 | 0.0 |
| 6 | Fare | 0 | 0.0 |
| 7 | Embarked | 0 | 0.0 |
No missing values for all features & zero values in Fare
**Test Data**
# --- Test data cleaning ---
# Cabin has the highest missing-value ratio, so drop it; impute Age and Fare.
test1 = test.copy()
# FIX: use keyword `columns=` (positional axis removed in pandas 2.0).
test1.drop(columns='Cabin', inplace=True)
# Impute Age with the mean, Fare with the median.
# NOTE(review): imputation statistics come from the *test* set itself;
# reusing the train-set mean/median would avoid leakage — confirm intent.
# FIX: assign the result instead of chained inplace fillna (deprecated).
test1['Age'] = test1['Age'].fillna(test['Age'].mean())
test1['Fare'] = test1['Fare'].fillna(test['Fare'].median())
test1.isna().sum()
PassengerId 0 Pclass 0 Name 0 Sex 0 Age 0 SibSp 0 Parch 0 Ticket 0 Fare 0 Embarked 0 dtype: int64
No missing values in test dataset
# Plot the distribution of each numerical feature on a 2-column grid.
plt.figure(figsize=(8, 5))
for i, col in enumerate(num):
    # FIX: subplot indices must be integers; len(num)/2 is a float in Py3.
    plt.subplot(2, len(num) // 2, i + 1)
    # FIX: sns.distplot was deprecated and removed; histplot(kde=True)
    # is the supported replacement for a histogram + density curve.
    sns.histplot(pre1[col], color='orange', kde=True)
plt.tight_layout()
# Tame the right skew of the count-like features with log(x + 1),
# then standardize every numerical column to zero mean / unit variance.
log = ['SibSp', 'Parch', 'Fare']
for column in log:
    pre1[column] = np.log(pre1[column] + 1)

for column in num:
    column_values = pre1[column].values.reshape(-1, 1)
    pre1[column] = StandardScaler().fit_transform(column_values)
# One-hot encode the categorical features.
# Train dataset: append dummy columns, then drop the originals.
cats_train = ['Sex', 'Embarked']
for cat in cats_train:
    onehots_train = pd.get_dummies(pre1[cat], prefix=cat)
    pre1 = pre1.join(onehots_train)
pre1.drop(columns=cats_train, inplace=True)

# Test dataset: append dummy columns (originals are dropped when
# building X_test below).
cats_test = ['Sex', 'Embarked']
for cat in cats_test:
    onehots_test = pd.get_dummies(test1[cat], prefix=cat)
    test1 = test1.join(onehots_test)

# Final model matrices.
# FIX: use keyword `columns=` (positional axis removed in pandas 2.0).
X_train = pre1.drop(columns='Survived')
y_train = pre1['Survived']
X_test = test1.drop(columns=['Name', 'Ticket', 'PassengerId', 'Sex', 'Embarked'])
def cross_val(Model, X_train, y_train, cval):
    """Return (mean, std) of stratified `cval`-fold CV accuracy, rounded to 4 dp."""
    splitter = StratifiedKFold(n_splits=cval, random_state=1, shuffle=True)
    scores = cross_val_score(Model, X_train, y_train, cv=splitter,
                             scoring='accuracy')
    return round(scores.mean(), 4), round(scores.std(), 4)
# Instantiate one untuned classifier per candidate algorithm
# (fixed random_state for reproducibility where the estimator supports it).
lr = LogisticRegression(random_state=42)
nb = GaussianNB()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(random_state=42)
knn = KNeighborsClassifier(n_neighbors=5)
svc = SVC(random_state=42)
xgb = XGBClassifier(random_state=42)
# Create function to present the CV results of every candidate as a dataframe
def model_cv_comparison(X_train, y_train):
    """Run 10-fold CV for each candidate model and return a comparison frame.

    Returns a DataFrame with columns Model, CV_Mean, CV_Stdev — one row per
    algorithm, using the estimator instances defined above.
    """
    # BUG FIX: the original passed `lr` to cross_val for Random Forest, KNN,
    # SVC and XGBoost, so four rows silently repeated the Logistic Regression
    # score (visible as identical 0.7919/0.0285 rows in the output table).
    candidates = [
        ('Logistic Regression', lr),
        ('Naive Bayes', nb),
        ('Decision Tree', dt),
        ('Random Forest', rf),
        ('KNN', knn),
        ('SVC', svc),
        ('XGBoost', xgb),
    ]
    rows = []
    for model_name, estimator in candidates:
        cv_mean, cv_std = cross_val(estimator, X_train, y_train, 10)
        rows.append({'Model': model_name, 'CV_Mean': cv_mean, 'CV_Stdev': cv_std})
    return pd.DataFrame(rows, columns=['Model', 'CV_Mean', 'CV_Stdev'])
model_cv_comparison(X_train,y_train)
| Model | CV_Mean | CV_Stdev | |
|---|---|---|---|
| 0 | Logistic Regression | 0.7919 | 0.0285 |
| 1 | Naive Bayes | 0.7761 | 0.0371 |
| 2 | Decision Tree | 0.7874 | 0.0272 |
| 3 | Random Forest | 0.7919 | 0.0285 |
| 4 | KNN | 0.7919 | 0.0285 |
| 5 | SVC | 0.7919 | 0.0285 |
| 6 | XGBoost | 0.7919 | 0.0285 |
The highest average cross-validation score is 0.7919, with a standard deviation of 0.0285. The simpler the algorithm the better, so I will use logistic regression.
# Fit the chosen model on the full training set and write the submission file.
# NOTE(review): X_test never received the log transform / standardization
# applied to X_train above — confirm whether the same preprocessing should
# be applied before predicting.
model = LogisticRegression(random_state=42)
model.fit(X_train, y_train)
survived_pred = model.predict(X_test)

submission = pd.DataFrame({'PassengerId': test.PassengerId,
                           'Survived': survived_pred})
submission.to_csv('my_submission.csv', index=False)
print("My submission was successfully saved!")
My submission was successfully saved!